#!/usr/bin/python

import sys
from numpy import prod
reload(sys)  # Reload does the trick!
sys.setdefaultencoding('UTF-8')
import pickle
import unicodecsv as csv
import re
import unidecode
import pandas as pd

# HACK: the directory holding the dynamic-nmf code has to be appended to
# sys.path by hand before the project imports below are attempted. The
# path is machine-specific; this will be fixed!
#sys.path.append('/home/tcoan/git_repos/DynamicNMF/dynamic-nmf')
dynamic_nmf_path = "/home/constantine/Dropbox/cards/baby-cards/dynamic-nmf"
sys.path.append(dynamic_nmf_path)

from text.slices import TimeSlices
from text.prepare import create_text_files
from text.prepare import process_text
import unsupervised
from estimate.window import FitWindow
from estimate.dynamic import FitDynamic
from estimate.dynamic import GetCoherence


#------------------------------------------------------------------
# Read data and initialize data

# Change working directory so that the relative './data' paths used below
# resolve against the dynamic-nmf directory.
import os
os.chdir(dynamic_nmf_path)

# Load the sample data. This is the CTT data for baby-cards.
# Pickle streams are binary data, so the file is opened in 'rb' mode; text
# mode only happens to work for ASCII-protocol pickles on POSIX systems.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only ever point this at a file from a trusted source.
with open('./data/skeptics.pkl', 'rb') as pfile:
    content = pickle.load(pfile)

# Transliterate the text to plain ASCII (stored under 'text_final') so the
# semantic coherence measure works properly.
for row in content:
    row['text_final'] = unicode(unidecode.unidecode(row['text_postprocessed']))

# Get the "time slice" data needed for the DtmNmfModel class
ts = TimeSlices(content, 'date')
time_slice_data = ts.get('text_final')

# Keep only rows whose last field is 2007 or later (presumably the year of
# the document -- verify against TimeSlices.get). Each window needs enough
# data to run or it will error out with an unhelpful message!
time_slice_data = [row for row in time_slice_data if row[-1] >= 2007]

#---------------------------------------------------------------------
# Prepare data for nmf

# Write the time-sliced documents under ./data in the form dynamic-nmf
# expects (overwrite is switched on for this call).
dpaths = create_text_files(time_slice_data, './data', overwrite=True)

# The last field of every returned entry is taken as the file path; those
# paths are then handed to the text-processing step.
paths = [entry[-1] for entry in dpaths]
texts = process_text(paths)


#---------------------------------------------------------------------
# Estimate topics

# Window topics. NOTE(review): ks presumably carries one topic count per
# time window -- confirm against FitWindow.
ks = [15, 20, 15, 25, 30, 15, 20, 35, 20, 15, 20]
fw = FitWindow(texts, ks)
window_topics = fw.fit()


# Sweep the number of dynamic topics (d_min..d_max in increments of step)
# given the estimated window topics. The result is a list of median
# coherence values in descending order.
d_min, d_max, step = 10, 125, 5
coh = GetCoherence(time_slice_data, window_topics, d_min, d_max, step)
coherence_metrics = coh.DynamicTopicCoherence()

# The first entry is the solution with the highest median topic coherence;
# its 'k' is the number of dynamic topics used for the final fit.
max_coherence = coherence_metrics[0]
max_k = max_coherence['k']

# Generate dynamic topic keywords and the document-topic weight matrix.
print('Estimating dynamic topic model with k = {}'.format(max_k))
fd = FitDynamic(window_topics)
fd.fit(max_k, verbose=True)
keywords = fd.get_dynamic_topics(20)
document_topic_matrix = fd.get_document_topics()



#-----------------------------------------------
# write keywords and topic data to disk
print("Saving data to disk...")
corpus = 'skeptics'
save_folder = '/home/constantine/Dropbox/cards/baby-cards/papers/ctts_vs_ngos/data/skeptics'

# Column labels for the document-topic matrix: one id column followed by
# one column per dynamic topic (topic0 .. topic<max_k - 1>).
labels = ['nmf_docid'] + ['topic' + str(topic_idx) for topic_idx in range(max_k)]

# Document ids are taken from the first field of every time-slice row.
docids = [row[0] for row in time_slice_data]

# Keep only the content rows that survived the time-slice filter. A set
# gives constant-time membership tests instead of scanning the list of
# docids once per content row.
docid_lookup = set(docids)
filtered_content = [row for row in content if row['docid'] in docid_lookup]

# create dataframes
df_docids = pd.DataFrame(docids, columns=['docid'])  # document ids
df_dtmatrix = pd.DataFrame(document_topic_matrix, columns=labels)  # document-dynamic topic matrix
# Column-wise concat aligns on the positional index, i.e. row i of the
# matrix is paired with docids[i].
# NOTE(review): assumes get_document_topics() returns rows in the same
# order as time_slice_data -- confirm.
df_dynamic_topics = pd.concat([df_docids, df_dtmatrix], axis=1)
df_data = pd.DataFrame(filtered_content)  # filtered content
df_coherence = pd.DataFrame(coherence_metrics)  # coherence metrics
df_keywords = pd.DataFrame(keywords)  # dynamic topic top terms

# merge data and save to disk (index=False: do not write the row index)
df_merged = pd.merge(df_data, df_dynamic_topics, on=['docid'], sort=False)
path = '{}/{}_k{}_dtmatrix.csv'.format(save_folder, corpus, max_k)
df_merged.to_csv(path, encoding='utf-8', index=False)
# drop the text columns and save a compact dt matrix
df_merged = df_merged.drop(['text_postprocessed', 'text', 'title', 'text_final'], axis=1)
path = '{}/{}_k{}_dtmatrix_compact.csv'.format(save_folder, corpus, max_k)
df_merged.to_csv(path, encoding='utf-8', index=False)
# save keywords
path = '{}/{}_k{}_keywords.csv'.format(save_folder, corpus, max_k)
df_keywords.to_csv(path, encoding='utf-8', index=False)
# save coherence metrics
path = '{}/{}_coherence_metrics.csv'.format(save_folder, corpus)
df_coherence.to_csv(path, encoding='utf-8', index=False)

print("Data saved.")



